# -*- coding: utf8 -*-
# Catch Xml(): scrape a category web page and collect the encoded (Chinese) names of the entry categories it lists
import urllib2
import urllib
import time
import re
# Patterns are bytes literals so they match the raw payload returned by
# read() on both Python 2 (where str is bytes) and Python 3.
# Compiled once here instead of on every call / loop iteration.
_SECTION_PAT = re.compile(
    b'class="w-630 bor-e1 mar-t10 l f-com">(.*?)line-height:21px;">')
_GROUP_PAT = re.compile(b'<a class="link_blue bold"(.*?)</dd></dl>')
_LINK_TEXT_PAT = re.compile(b'">(.*?)</a>')


def add_func(url):
    """Fetch a category page and return the link texts of its category list.

    The page is downloaded, newlines are flattened to spaces, and the first
    region between the ``w-630 bor-e1 mar-t10 l f-com`` container and the
    following ``line-height:21px;`` marker is scanned.  Inside that region,
    every ``<a class="link_blue bold" ...> ... </dd></dl>`` group contributes
    the text of each ``">...</a>`` link it contains, in document order.

    The number of collected entries is printed to stdout before returning.

    Args:
        url: Address of the page to download (anything ``urlopen`` accepts).

    Returns:
        A flat list of raw (undecoded) byte strings, one per link text.
        The list is empty when the page does not contain the expected
        container, instead of failing with ``IndexError``.

    Raises:
        Whatever ``urlopen`` raises for an unreachable or invalid URL.
    """
    from contextlib import closing
    try:
        from urllib2 import urlopen  # Python 2
    except ImportError:
        from urllib.request import urlopen  # Python 3

    # closing() guarantees the connection is released even if read() fails;
    # the Python 2 response object is not itself a context manager.
    with closing(urlopen(url)) as response:
        content = response.read()

    # "." does not match newlines, so flatten the page to a single line.
    content = content.replace(b'\n', b' ')

    # Keep only the main category listing (drops the micro-encyclopedia part).
    sections = _SECTION_PAT.findall(content)
    groups = _GROUP_PAT.findall(sections[0]) if sections else []

    # Tiny pause kept from the original implementation.
    time.sleep(0.001)

    sites = []
    for group in groups:
        # The lazy group match stops right before "</dd></dl>", so a trailing
        # "</a>" is appended to make sure the last link text is terminated.
        sites.extend(_LINK_TEXT_PAT.findall(group + b'</a>'))

    print(len(sites))
    return sites
#url=('http://fenlei.baike.com/地质学家/list')
#url=('http://fenlei.baike.com/')
#add_func(url)
